
global Truck "/Users/yuanningliang/Truck/"

cd "$Truck/processed_data"


/* full sample of truck ever inspected */
 /* use only data after 1996 */
foreach j of numlist 1996 {
	insheet using "InspectionData/Insp_Unit_Pub_`j'.csv", clear
	keep inspection_id insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number
	destring inspection_id, force replace
	drop if insp_unit_license == ""  & insp_unit_vehicle_id_number == ""
	bysort insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number: keep if _n == 1
	save "InspectionData/Insp_Unit_`j'", replace  
}


* before 1994 (including), all VIN are missing, so they are recorded as . (numeric)
* after 1996, there are some records on VIN (50%), and they are all string, so merge using license plate number
* after 2010, only 10% of VIN is missing

foreach j of numlist 1997/2010 {
	insheet using "InspectionData/Insp_Unit_Pub_`j'.csv", clear
	keep inspection_id insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number
	destring inspection_id, force replace
	drop if insp_unit_license == ""  & insp_unit_vehicle_id_number == ""
	bysort insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number: keep if _n == 1
	save "InspectionData/Insp_Unit_`j'", replace  

	* merge in previous years using insp_unit_license_state & insp_unit_license
	* because VIN are missing a lot for early years, but in later years (after 2010), most obv have both license & VIN recorded
	local lastyear = `j' - 1
	foreach i of numlist 1996/`lastyear' {
		merge m:m insp_unit_license insp_unit_license_state using "InspectionData/Insp_Unit_`i'", force
		keep if _merge == 1 // new (not inspected) trucks in year i (previous years)
		drop _merge
	}
	save "InspectionData/Insp_Unit_`j'", replace
}


foreach j of numlist 2011/2018 {
	insheet using "InspectionData/Insp_Unit_Pub_`j'.csv", clear
	keep inspection_id insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number
	destring inspection_id, force replace
	drop if insp_unit_license == ""  & insp_unit_vehicle_id_number == ""
	bysort insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number: keep if _n == 1
	save "InspectionData/Insp_Unit_`j'", replace  

	* merge in previous years using insp_unit_license_state & insp_unit_license
	* because VIN are missing a lot for early years, in later years (after 2010), most obv have both license & VIN recorded
	local lastyear = `j' - 1
	foreach i of numlist 1996/2009 {
		merge m:m insp_unit_license insp_unit_license_state using "InspectionData/Insp_Unit_`i'", force
		keep if _merge == 1 // new (not inspected) trucks in year i (previous years)
		drop _merge
	}

	foreach i of numlist 2010/`lastyear' {
		merge m:m insp_unit_vehicle_id_number using "InspectionData/Insp_Unit_`i'", force
		keep if _merge == 1 // new (not inspected) trucks in year i (previous years)
		drop _merge
	}
	save "InspectionData/Insp_Unit_`j'", replace
}


/* inspection history */
* step 1: find all inspection_id of sampled trucks from the unit table

use "InspectionData/Insp_Unit_1996", clear
foreach i of numlist 1997/2018 {
	append using "InspectionData/Insp_Unit_`i'"
}
compress
save "InspectionData/temp/Insp_Unit_100pct", replace


* generate 2 truck list with no duplicates or missing in license / VIN
use "InspectionData/temp/Insp_Unit_100pct", clear
drop inspection_id
duplicates drop insp_unit_license insp_unit_license_state, force
drop if insp_unit_license == ""

gen drop = 1 if strpos(insp_unit_license,"NONE")>0
replace drop = 1 if strpos(insp_unit_license,"00000")>0
replace drop = 1 if strpos(insp_unit_license,"TEMP")>0
replace drop = 1 if strpos(insp_unit_license,"UNK")>0
replace drop = 1 if strpos(insp_unit_license,"UN")>0
replace drop = 1 if strpos(insp_unit_license,"**")>0
replace drop = 1 if strpos(insp_unit_license,"--")>0
replace drop = 1 if strpos(insp_unit_license,"!")>0
replace drop = 1 if strpos(insp_unit_license,",")>0
replace drop = 1 if strpos(insp_unit_license,"#")>0
replace drop = 1 if strpos(insp_unit_license,"NEW")>0
replace drop = 1 if strpos(insp_unit_license,"FLASH")>0
replace drop = 1 if insp_unit_license == "0" | insp_unit_license == "00"  | insp_unit_license == "000" | insp_unit_license == "-"

drop if drop == 1
drop drop 

save "Truck_100pct_nodup_lic", replace


use "InspectionData/temp/Insp_Unit_100pct", clear
drop inspection_id
duplicates drop insp_unit_vehicle_id_number, force

replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UNKNOWN" | insp_unit_vehicle_id_number == "SAME" | insp_unit_vehicle_id_number == "UNREADABLE" | insp_unit_vehicle_id_number == "NO VIN" | insp_unit_vehicle_id_number == "RR" | insp_unit_vehicle_id_number == "DRIVER/CARRIER"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKNOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UNKOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "00000000000000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "0"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "."
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "//"	
drop if insp_unit_vehicle_id_number == ""

gen drop = 1 if strpos(insp_unit_vehicle_id_number,"NONE")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"0000000")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"TEMP")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"UNK")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"**")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"--")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"- - -")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"...")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"///")>0
replace drop = 1 if strlen(insp_unit_vehicle_id_number)<5
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"#")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"NOT")>0
replace drop = 1 if strpos(insp_unit_vehicle_id_number,"white")>0
drop if insp_unit_vehicle_id_number == "CARRIER"

drop if drop == 1
drop drop 

save "Truck_100pct_nodup_VIN", replace



* find all inspection_id of sampled trucks from the unit table

foreach i of numlist 1996/2018 {   
	insheet using "InspectionData/Insp_Unit_Pub_`i'.csv", clear
	
	keep inspection_id insp_unit_license insp_unit_license_state insp_unit_decal insp_unit_decal_number 
	
	* first, merge using vehicle license
	merge m:1 insp_unit_license insp_unit_license_state using "TruckSample/Truck_100pct_nodup_lic", force keep(match) nogen

	save "InspectionData/temp/Insp_Trucklic_`i'_100pct", replace
}

	* second, merge using VIN for years 1996/2018 (before 1994, all VIN are missing)
foreach i of numlist 1996/2018 {   
	insheet using "InspectionData/Insp_Unit_Pub_`i'.csv", clear

	keep inspection_id insp_unit_vehicle_id_number insp_unit_decal insp_unit_decal_number

	* merge using VIN
	merge m:1 insp_unit_vehicle_id_number using "TruckSample/Truck_100pct_nodup_VIN", keep(match) nogen

	save "InspectionData/temp/Insp_TruckVIN_`i'_100pct", replace
}


* combine two methods of merging, drop duplicates
foreach i of numlist 1996/2018 {   
	use "InspectionData/temp/Insp_Trucklic_`i'_100pct", clear
	append using "InspectionData/temp/Insp_TruckVIN_`i'_100pct"
	
	duplicates drop inspection_id , force
		// drop inspections merged using both methods
		// note that there are duplicates in the using data, because one inspection_id records both the truck and trailor (2 entries)
	gen insp_year = `i'
	
	save "InspectionData/temp/Insp_Truck_`i'_100pct", replace
}

use "InspectionData/temp/Insp_Truck_1996_100pct", clear
foreach i of numlist 1997/2018 {   
	append using "InspectionData/temp/Insp_Truck_`i'_100pct"
}
* drop year
save "InspectionData/Insp_Truck_100pct", replace


* erase temp files
foreach i of numlist 1996/2018 {
	erase "InspectionData/temp/Insp_Trucklic_`i'_100pct.dta"
	erase "InspectionData/temp/Insp_TruckVIN_`i'_100pct.dta"
	erase "InspectionData/temp/Insp_Truck_`i'_100pct.dta"
}





/* a list of states within 9 divisions: (separately for the 6 states (CA, TX, OR, NY, & MI, IL ))
Region 1: Northeast
	Division 1: New England (Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, and Vermont)
	Division 2: Mid-Atlantic (New Jersey, New York, and Pennsylvania)
Region 2: Midwest (Prior to June 1984, the Midwest Region was designated as the North Central Region.)[7]
	Division 3: East North Central (Illinois, Indiana, Michigan, Ohio, and Wisconsin)
	Division 4: West North Central (Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota)
Region 3: South
	Division 5: South Atlantic (Delaware, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, District of Columbia, and West Virginia)
	Division 6: East South Central (Alabama, Kentucky, Mississippi, and Tennessee)
	Division 7: West South Central (Arkansas, Louisiana, Oklahoma, and Texas)
Region 4: West
	Division 8: Mountain (Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, and Wyoming)
	Division 9: Pacific (Alaska, California, Hawaii, Oregon, and Washington)
*/
local d1 "CT MA	ME	NH	RI	VT"
local d2 "NJ	PA"
local d3 "IN	OH	WI"
local d4 "IA	KS	MN	MO	ND	NE	SD"
local d5 "DC	DE	FL	GA	MD	NC	SC	VA	WV"
local d6 "AL	KY	MS	TN"
local d7 "AR	LA	OK"
local d8 "AZ	CO	ID	MT	NM	NV	UT	WY"
local d9 "AK	HI	WA"


foreach k of numlist 1/9  {

* step 2: extract inspections happened in each division
foreach i of numlist 1996/2018 { 
  
	insheet using "InspectionData/Insp_Pub_`i'.csv", clear
	
	keep insp_date insp_start insp_end ///
		insp_facility inspection_id post_acc_ind dot_number ///
		report_state county_code location 
	
	* keep only division d`k'
	gen division = .
	foreach j of local d`k' {
		replace division = 1 if report_state == "`j'"
	}
	keep if division == 1
	
	keep if insp_facility == "F" // keep only fixed station inspections
	
	gen insp_year = `i'

	merge 1:1 inspection_id insp_year using "InspectionData/Insp_Truck_100pct", keep(match) nogen

	tostring post_acc_ind, replace
	replace post_acc_ind = "" if post_acc_ind == "."
	
	drop if post_acc_ind == "Y"
	drop insp_facility post_acc_ind division

	save "InspectionData/temp/Insp_d`k'_`i'.dta", replace
}
use "InspectionData/temp/Insp_d`k'_1996.dta", clear
foreach i of numlist 1997/2018 {   // inspection time info
	append using "InspectionData/temp/Insp_d`k'_`i'.dta"
}
rename report_state state
gen str3 COUNTYFP = string(county_code, "%03.0f")

merge m:1 state COUNTYFP using "CensusData/countyfips", keep(master match) nogen
egen insp_countyfips = concat(STATEFP COUNTYFP)
rename state insp_state
drop COUNTYFP county_code STATEFP countyfips

compress
save "InspectionData/Insp_d`k'.dta", replace  

/* erase temp files
foreach i of numlist 1996/2018 {
	erase "InspectionData/temp/Insp_d`k'_`i'.dta" */

}

* 6states
foreach i of numlist 1996/2018 {   
	insheet using "InspectionData/Insp_Pub_`i'.csv", clear
	keep insp_date insp_start insp_end ///
		insp_facility inspection_id post_acc_ind dot_number ///
		report_state county_code location 
	
	keep if report_state == "CA" | report_state == "TX" | report_state == "OR" | report_state == "NY" | report_state == "MI" | report_state == "IL"
	keep if insp_facility == "F" // keep only fixed station inspections
	
	gen insp_year = `i'

	merge 1:1 inspection_id insp_year using "InspectionData/Insp_Truck_100pct", keep(match) nogen

	tostring post_acc_ind, replace
	replace post_acc_ind = "" if post_acc_ind == "."
	
	drop if post_acc_ind == "Y"
	drop insp_facility post_acc_ind

	save "InspectionData/temp/Insp_6state`i'.dta", replace
}
use "InspectionData/temp/Insp_6state1996.dta", clear
foreach i of numlist 1997/2018 {   // inspection time info
	append using "InspectionData/temp/Insp_6state`i'.dta"
}
rename report_state state
gen str3 COUNTYFP = string(county_code, "%03.0f")

merge m:1 state COUNTYFP using "CensusData/countyfips", keep(master match) nogen
egen insp_countyfips = concat(STATEFP COUNTYFP)
rename state insp_state
drop COUNTYFP county_code STATEFP countyfips

compress
save "InspectionData/Insp_6state.dta", replace  



/* all crash history */

foreach i of numlist 1996/2018 {   
	insheet using "CrashData/CrashMaster_`i'.csv", clear

	keep report_date report_time report_number crash_id dot_number report_seq_no /// id
		 vehicle_license_number vehicle_lic_state vehicle_identification_number /// veh license & VIN
		 report_state county_code location /// location
		 fatalities injuries tow_away vehicles_in_accident /// severness
		 road_surface_condition_id gvw_rating_id weather_condition vehicle_hazmat_placard light_condition hazmat_released // external conditions
		 
			// report_number corresponds to a single crash event; 
			// crash_id is unique with each truck involved.

	rename vehicle_lic_state insp_unit_license_state
	rename vehicle_license_number insp_unit_license
	rename vehicle_identification_number insp_unit_vehicle_id_number

	compress
	save "CrashData/temp/Crash_`i'", replace
}

use "CrashData/temp/Crash_1996", clear
foreach i of numlist 1997/2018 {   
	append using "CrashData/temp/Crash_`i'", force
}

* date and time of crash
gen crash_year = floor(report_date/10000)
gen crash_month = floor((report_date - crash_year*10000)/100)
gen crash_day = report_date - crash_year*10000 - crash_month*100

replace report_time = . if report_time == 9999  // some of the report_time is 9999, indicates unknown report_time (.2%)
gen crash_hr = floor(report_time/100)   
gen crash_min = report_time - crash_hr * 100

gen date = mdy(crash_month,crash_day,crash_year)
gen crash_date = date
gen crash_time = mdyhms(crash_month,crash_day,crash_year,crash_hr,crash_min,0)

* county of crash
rename report_state state
gen str3 COUNTYFP = string(county_code, "%03.0f")

merge m:1 state COUNTYFP using "CensusData/countyfips", keep(master match) nogen
gegen crash_countyfips = concat(STATEFP COUNTYFP)
rename state crash_state
drop COUNTYFP county_code STATEFP countyfips  crash_hr crash_min

drop if strlen(insp_unit_vehicle_id_number)>20 // 8, misrecording

compress
save "CrashData/Crash_all", replace

	
	
 /* extract Crash Event ID */
foreach i of numlist 1996/2018{
	insheet using "CrashData/CrashEvent_`i'.csv", clear
	keep crash_id event_id
	bysort crash_id: gen byte event_nbr = _n
	reshape wide event_id, i(crash_id) j(event_nbr)
			// crash_id is unique with each truck involved.
	merge 1:1 crash_id using "CrashData/Crash_all", nogen keep(match)
	
	compress
	save "CrashData/temp/CrashEvent_ID_`i'", replace
}

use "CrashData/temp/CrashEvent_ID_1996",clear
foreach i of numlist 1997/2018{
	append using "CrashData/temp/CrashEvent_ID_`i'"
}
keep crash_id event_id1 event_id2 event_id3 event_id4
save "CrashData/CrashEvent_all", replace

/* erase temp files: 
foreach i of numlist 1996/2018{
	erase "CrashData/temp/CrashEvent_ID_`i'_d`k'"
	*/



* merge into crash event 
use "CrashData/Crash_all",clear
merge 1:1 crash_id using "CrashData/CrashEvent_all", nogen

compress
save "CrashData/Crash_all", replace




* use VIN
use "CrashData/Crash_all",clear

duplicates drop report_date report_time insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number, force 
	// if report_date & time & license are all the same, this is a single crash event

bysort report_date insp_unit_license insp_unit_license_state location: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// if the report time is different but close, and the crashes happened at the same location, this is also a single crash event	
	// keep the earlier one in terms of report_time
drop report_time_earlier

bysort date insp_unit_vehicle_id_number: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// drop location info FOR NOW: there are multiple crashes happen for the same truck on the same day but in different counties

gen crash = 1
collapse (sum) crash fatalities injuries vehicles_in_accident, by(crash_state crash_countyfips report_date report_time crash_time insp_unit_vehicle_id_number crash_year crash_month crash_day crash_date date event_id1 event_id2 event_id3 event_id4 road_surface_condition_id gvw_rating_id weather_condition light_condition)

replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "SAME" | insp_unit_vehicle_id_number == "UNREADABLE" | insp_unit_vehicle_id_number == "NO VIN" | insp_unit_vehicle_id_number == "RR" | insp_unit_vehicle_id_number == "DRIVER/CARRIER"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKNOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "0"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "."
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "//"	
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "CARRIER" | insp_unit_vehicle_id_number == "DRIVER"

replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NONE")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"0000000")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"TEMP")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"UNK")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"**")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"--")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"- - -")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"...")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"///")>0
replace insp_unit_vehicle_id_number = "" if strlen(insp_unit_vehicle_id_number)<5
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"#")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NOT")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"white")>0

drop if insp_unit_vehicle_id_number == ""
duplicates drop date insp_unit_vehicle_id_number, force // some dot_numbers are different

compress
save "CrashData/Crash_allVIN", replace

log using "$Truck/outputs/Log/division_100pct/construct_100pct", append text name(construct_100pct)


* use license
use "CrashData/Crash_all",clear

duplicates drop report_date report_time insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number, force 
	// if report_date & time & license are all the same, this is a single crash event

bysort report_date insp_unit_license insp_unit_license_state location: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// if the report time is different but close, and the crashes happened at the same location, this is also a single crash event	
	// keep the earlier one in terms of report_time
drop report_time_earlier

bysort date insp_unit_license insp_unit_license_state: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// drop location info FOR NOW: there are multiple crashes happen for the same truck on the same day but in different counties

gen crash = 1
collapse (sum) crash fatalities injuries vehicles_in_accident, by(crash_state crash_countyfips report_date report_time crash_time dot_number insp_unit_license insp_unit_license_state crash_year crash_month crash_day crash_date date event_id1 event_id2 event_id3 event_id4 road_surface_condition_id gvw_rating_id weather_condition light_condition)

duplicates drop date insp_unit_license insp_unit_license_state, force // some dot_numbers are different

drop if insp_unit_license == ""

compress
save "CrashData/Crash_alllic", replace



local division "d1 d2 d3 d4 d5 d6 d7 d8 d9 6state"

foreach k of local division {

/* Merge crash with inspection */

* step 1: Inspection Event Days Window

* create a balanced event date panel for all inspections
* first dataset: VIN all exist

use "InspectionData/Insp_`k'", clear
keep if insp_unit_vehicle_id_number ~= ""

replace insp_year = floor(insp_date/10000)
gen insp_month = floor((insp_date - insp_year*10000)/100)
gen insp_day = insp_date - insp_year * 10000 - insp_month * 100

expand 31 
bysort insp_unit_vehicle_id_number inspection_id insp_date: gen event_time = _n - 16

gen date = mdy(insp_month,insp_day,insp_year)
replace date = date + event_time
replace insp_date = . if event_time ~= 0
save "EventData/insp_panel/insp_event_time_`k'_VIN.dta", replace

* second dataset: VIN all missing	
use "InspectionData/Insp_`k'", clear
keep if insp_unit_vehicle_id_number == ""

replace insp_year = floor(insp_date/10000)
gen insp_month = floor((insp_date - insp_year*10000)/100)
gen insp_day = insp_date - insp_year * 10000 - insp_month * 100

expand 31 
bysort insp_unit_license insp_unit_license_state inspection_id insp_date: gen event_time = _n - 16	

gen date = mdy(insp_month,insp_day,insp_year)
replace date = date + event_time
replace insp_date = . if event_time ~= 0
save "EventData/insp_panel/insp_event_time_`k'_lic.dta", replace

}	



* inspection-crash event time correspondence
local division "d1 d2 d3 d4 d5 d6 d7 d8 d9 6state"


foreach k of local division {

*VIN
use "EventData/insp_panel/insp_event_time_`k'_VIN.dta", clear

drop insp_unit_decal insp_unit_decal_number insp_state insp_date location
 
local varlist " road_surface_condition_id gvw_rating_id weather_condition_id light_condition_id report_date report_time crash_time crash_countyfips event_id1 event_id2 event_id3 event_id4 crash fatalities injuries vehicles_in_accident"

merge m:1 date insp_unit_vehicle_id_number using "CrashData/Crash_allVIN", nogen keep(master match) keepusing(`varlist')

foreach var of local varlist {
rename `var' `var'VIN
}

merge m:1 date insp_unit_license insp_unit_license_state using "CrashData/Crash_alllic", nogen keep(master match) keepusing(`varlist')

foreach var of varlist road_surface_condition_id gvw_rating_id weather_condition_id light_condition_id report_date report_time crash_time event_id1 event_id2 event_id3 event_id4 crash fatalities injuries vehicles_in_accident {
replace `var'VIN = `var' if `var'VIN == .
}
replace crash_countyfipsVIN = crash_countyfips if crash_countyfipsVIN == ""

drop `varlist'
foreach var of local varlist {
rename `var'VIN `var'
}

compress
save "EventData/temp/Insp_Crash_Event_`k'_VIN.dta", replace


*lic
use "EventData/insp_panel/insp_event_time_`k'_lic.dta", clear

drop insp_unit_decal insp_unit_decal_number insp_state insp_date location

merge m:1 date insp_unit_license insp_unit_license_state using "CrashData/Crash_alllic", nogen keep(master match) keepusing(`varlist')

save "EventData/temp/Insp_Crash_Event_`k'_lic.dta", replace


* append two datasets
use "EventData/temp/Insp_Crash_Event_`k'_lic.dta",clear
append using "EventData/temp/Insp_Crash_Event_`k'_VIN.dta"


gen inspection = 1 if event_time == 0
replace inspection = 0 if inspection == .
replace crash = 0 if crash == .
replace injuries = 0 if injuries == .
replace fatalities = 0 if fatalities == .


* gen dep & indep var
gen insp_start_hr = floor(insp_start_time/100)
gen insp_start_min = insp_start_time - insp_start_hr * 100

gen insp_time = mdyhms(insp_month,insp_day,insp_year,insp_start_hr,insp_start_min,0)

drop insp_start_hr insp_start_min

gen post_insp = (event_time >= 0 )

gen year = year(date)
gen month = month(date)
gen dayofweek = dow(date)

compress
save "EventData/division_100pct/Insp_Crash_Event_`k'.dta", replace


* erase temp files
*erase "EventData/temp/Insp_Crash_Event_d`k'_lic.dta"
*erase "EventData/temp/Insp_Crash_Event_d`k'_VIN.dta"

}


* append all divisions
use "EventData/division_100pct/Insp_Crash_Event_6state.dta", clear

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9"
foreach k of local division {
	append using "$Truck/processed_data/EventData/division_100pct/Insp_Crash_Event_`k'.dta"	
}

compress
save "$Truck/processed_data/EventData/division_100pct/temp/Insp_Crash_100pct_pre.dta", replace


* gen regression variables
use "$Truck/processed_data/EventData/division_100pct/temp/Insp_Crash_100pct_pre.dta", clear

gegen new_license = group(insp_unit_license_state insp_unit_license)
replace new_license = 0 if new_license == .
replace new_license = - new_license  

gegen new_VIN = group(insp_unit_vehicle_id_number)
replace new_VIN = new_license if insp_unit_vehicle_id_number == ""

gegen new_group_id = group(new_VIN inspection_id insp_year)
replace new_group_id = 0 if new_group_id == .
replace new_group_id = . if event_time == .


* outside event window insp indicator
bysort new_VIN: gegen last_insp = max(mdy(insp_month, insp_day, insp_year))
bysort new_VIN: gegen first_insp = min(mdy(insp_month, insp_day, insp_year))

gen insp_p = 0
gen insp_m = 0
replace insp_p = 1 if mdy(insp_month, insp_day, insp_year) > first_insp
replace insp_m = 1 if mdy(insp_month, insp_day, insp_year) < last_insp


* drop events within 3,6,12,18,24 hours of inspection
foreach t of numlist 12 18 {
	local hr_`t' = mdyhms(1,1,2000,`t',0,0)-mdyhms(1,1,2000,0,0,0)

	gen temp_crash_`t'hr = 1 if crash_time >= insp_time - `hr_`t'' & crash_time < insp_time
	bysort new_group_id: gegen crash_event_`t'hr = max(temp_crash_`t'hr)
}

drop temp_crash*

compress
save "$Truck/processed_data/EventData/division_100pct/Insp_Crash_100pct_pre.dta", replace




* gen 1-day and 2-day bins

local insp_varlist year month date dayofweek insp_p insp_m event_time post_insp new_VIN new_group_id inspection insp_time insp_countyfips 
local crash_varlist crash event_id1 event_id2 event_id3 event_id4 vehicles_in_accident injuries fatalities crash_event* crash_time 

use `insp_varlist' `crash_varlist' using "EventData/division_100pct/Insp_Crash_100pct_pre.dta", clear nolabel

* generate inspection lead and lags
tsset new_group_id event_time

quietly forvalues i=1/14 { 
gen byte F`i'insp=F`i'.inspection
replace F`i'insp = 0 if F`i'insp == .

gen byte L`i'insp=L`i'.inspection
replace L`i'insp = 0 if L`i'insp == .
}


* 2-day bins
* combine into 2-day bins to increase power [(F14,F13)...(F2,F1),(0,L1),(L2,L3)...(L12,L13)
egen insp_m0_1=rowmax(inspection L1insp)
replace insp_m0_1=0 if insp_m0_1==.

quietly forvalues i=2(2)12 {
local k=`i'+1
egen insp_m`i'_`k'=rowmax(L`i'insp L`k'insp)
replace insp_m`i'_`k'=0 if insp_m`i'_`k'==.
}

quietly forvalues i=1(2)13 {
local k=`i'+1
egen insp_p`i'_`k'=rowmax(F`i'insp F`k'insp)
replace insp_p`i'_`k'=0 if insp_p`i'_`k'==.
}	
		
compress
save "EventData/division_100pct/Insp_Crash_100pct.dta", replace





* extract inspection outcomes

foreach i of numlist 1996/2018 { 
  
	di "*** year `i' ***"
	
	insheet using "InspectionData/Insp_Pub_`i'.csv", clear
	
	keep insp_facility inspection_id post_acc_ind ///
		alcohol_control_sub drug_intrdctn_search drug_intrdctn_arrests size_weight_enf traffic_enf ///
		viol_total oos_total driver_viol_total driver_oos_total vehicle_viol_total vehicle_oos_total hazmat_viol_total hazmat_oos_total
	
	keep if insp_facility == "F"
	
	foreach var of varlist size_weight_enf traffic_enf post_acc_ind drug_intrdctn_arrests drug_intrdctn_search {
	
		tostring `var', replace
		replace `var' = "" if `var' == "."

	}
	
	drop if post_acc_ind == "Y"
	drop insp_facility post_acc_ind
	
	gen insp_year = `i'

	merge 1:1 inspection_id insp_year using "InspectionData/Insp_Truck_100pct", keep(match) nogen
	
	compress
	save "InspectionData/temp/Insp_outcome_`i'.dta", replace
}

use "InspectionData/temp/Insp_outcome_1996.dta", clear
foreach i of numlist 1997/2018 {  
	append using "InspectionData/temp/Insp_outcome_`i'.dta"
}

destring drug_intrdctn_arrests, force gen(drug_intrdctn_arrests_n) // all numeric
drop drug_intrdctn_arrests

foreach var of varlist alcohol_control_sub drug_intrdctn_search size_weight_enf traffic_enf insp_unit_decal {

	gen byte `var'_n = 1 if `var' == "Y"
	replace `var'_n = 0 if `var' == "N"
	replace `var'_n = 99 if `var' == "U"

	drop `var'
}

drop insp_unit_vehicle_id_number insp_unit_license_state insp_unit_license

compress
save "InspectionData/Insp_outcome.dta", replace  


* merge inspection outcome into insp-crash-event panel
use "$Truck/processed_data/EventData/division_100pct/temp/Insp_Crash_100pct_pre.dta", clear

fmerge m:1 inspection_id insp_year using "$Truck/processed_data/InspectionData/Insp_outcome.dta", ///
	keepusing(viol_total oos_total driver_viol_total vehicle_viol_total) nogen keep(match master)

gen flag_oos = 1 if oos_total > 0 & oos_total ~= .

* generate inspection lead and lags
tsset new_group_id event_time

quietly forvalues i=1/14 { 
gen byte F`i'insp=F`i'.inspection
replace F`i'insp = 0 if F`i'insp == .

gen byte L`i'insp=L`i'.inspection
replace L`i'insp = 0 if L`i'insp == .
}


* 2-day bins
* combine into 2-day bins to increase power [(F14,F13)...(F2,F1),(0,L1),(L2,L3)...(L12,L13)
egen insp_m0_1=rowmax(inspection L1insp)
replace insp_m0_1=0 if insp_m0_1==.

quietly forvalues i=2(2)12 {
local k=`i'+1
egen insp_m`i'_`k'=rowmax(L`i'insp L`k'insp)
replace insp_m`i'_`k'=0 if insp_m`i'_`k'==.
}

quietly forvalues i=1(2)13 {
local k=`i'+1
egen insp_p`i'_`k'=rowmax(F`i'insp F`k'insp)
replace insp_p`i'_`k'=0 if insp_p`i'_`k'==.
}	

compress

save "$Truck/processed_data/EventData/division_100pct/Insp_Crash_100pct_outcome.dta", replace





///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
//// monthly event study2: (-12,24) event months
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////

* crash history aggregated to monthly level
use "CrashData/Crash_allVIN", clear 

foreach i of numlist 1/4 {
 
  gen crash_single`i' = 1 if event_id`i' ~= 13 & event_id`i' ~= 98 & event_id`i' ~= . 	
  
  gen crash_multi`i' = 1 if event_id`i' == 13

 }
gen crash_single = (crash_single1 == 1)
replace crash_single = 1 if crash_single2 == 1 
replace crash_single = 1 if crash_single3 == 1
replace crash_single = 1 if crash_single4 == 1
replace crash_single = 1 if vehicles_in_accident == 1

gen crash_multi = (crash_multi1 == 1)
replace crash_multi = 1 if crash_multi2 == 1 
replace crash_multi = 1 if crash_multi3 == 1
replace crash_multi = 1 if crash_multi4 == 1
replace crash_multi = 1 if crash_single ~= 1 & vehicles_in_accident > 1
replace crash_multi = 0 if crash_single == 1 

drop crash_single1 crash_single2 crash_single3 crash_single4 crash_multi1 crash_multi2 crash_multi3 crash_multi4 

gcollapse (sum) crash injuries fatalities vehicles_in_accident crash_single crash_multi, by(insp_unit_vehicle_id_number crash_year crash_month)

gen crash_ym = ym(crash_year,crash_month)
	
gen event_yearmonth = crash_ym

compress
save "CrashData/Crash_month_allVIN", replace


use "CrashData/Crash_alllic", clear

foreach i of numlist 1/4 {
 
  gen crash_single`i' = 1 if event_id`i' ~= 13 & event_id`i' ~= 98 & event_id`i' ~= . 	
  
  gen crash_multi`i' = 1 if event_id`i' == 13

 }
gen crash_single = (crash_single1 == 1)
replace crash_single = 1 if crash_single2 == 1 
replace crash_single = 1 if crash_single3 == 1
replace crash_single = 1 if crash_single4 == 1
replace crash_single = 1 if vehicles_in_accident == 1

gen crash_multi = (crash_multi1 == 1)
replace crash_multi = 1 if crash_multi2 == 1 
replace crash_multi = 1 if crash_multi3 == 1
replace crash_multi = 1 if crash_multi4 == 1
replace crash_multi = 1 if crash_single ~= 1 & vehicles_in_accident > 1
replace crash_multi = 0 if crash_single == 1 

drop crash_single1 crash_single2 crash_single3 crash_single4 crash_multi1 crash_multi2 crash_multi3 crash_multi4 

gcollapse (sum) crash injuries fatalities vehicles_in_accident crash_single crash_multi, by(insp_unit_license insp_unit_license_state crash_year crash_month)

gen crash_ym = ym(crash_year,crash_month)
	
gen event_yearmonth = crash_ym

compress
save "CrashData/Crash_month_alllic", replace


* Inspection Event Month (-12,24) Window
log using "$Truck/outputs/Log/division_100pct/monthly_100pct", append text name(monthly_100pct)

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9 6state"

qui foreach k of local division {

noi di " *** division `k', VIN *****"
* create a balanced event date panel for all inspections
* first dataset: VIN all exist

use "InspectionData/Insp_`k'", clear

replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "SAME" | insp_unit_vehicle_id_number == "UNREADABLE" | insp_unit_vehicle_id_number == "NO VIN" | insp_unit_vehicle_id_number == "RR" | insp_unit_vehicle_id_number == "DRIVER/CARRIER"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKNOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "0"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "."
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "//"	
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "CARRIER" | insp_unit_vehicle_id_number == "DRIVER"

replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NONE")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"0000000")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"TEMP")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"UNK")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"**")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"--")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"- - -")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"...")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"///")>0
replace insp_unit_vehicle_id_number = "" if strlen(insp_unit_vehicle_id_number)<5
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"#")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NOT")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"white")>0

keep if insp_unit_vehicle_id_number ~= ""

compress
save "EventData/temp/insp_month2_`k'_VIN.dta", replace



noi di " *** division `k', license *****"

* second dataset: VIN all missing	
use "InspectionData/Insp_`k'", clear

replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "SAME" | insp_unit_vehicle_id_number == "UNREADABLE" | insp_unit_vehicle_id_number == "NO VIN" | insp_unit_vehicle_id_number == "RR" | insp_unit_vehicle_id_number == "DRIVER/CARRIER"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKNOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "0"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "."
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "//"	
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "CARRIER" | insp_unit_vehicle_id_number == "DRIVER"

replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NONE")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"0000000")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"TEMP")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"UNK")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"**")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"--")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"- - -")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"...")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"///")>0
replace insp_unit_vehicle_id_number = "" if strlen(insp_unit_vehicle_id_number)<5
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"#")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NOT")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"white")>0

keep if insp_unit_vehicle_id_number == ""
drop if insp_unit_license == ""

compress
save "EventData/temp/insp_month2_`k'_lic.dta", replace
}	


* append all inspection events
use "EventData/temp/insp_month2_6state_VIN.dta", clear

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9"

foreach k of local division {

	append using "EventData/temp/insp_month2_`k'_VIN.dta"
	
}


replace insp_year = floor(insp_date/10000)
gen insp_month = floor((insp_date - insp_year*10000)/100)
gen insp_day = insp_date - insp_year * 10000 - insp_month * 100

gen inspection = 1

* insp_time
gen insp_start_hr = floor(insp_start_time/100)
gen insp_start_min = insp_start_time - insp_start_hr * 100

gen insp_time = mdyhms(insp_month,insp_day,insp_year,insp_start_hr,insp_start_min,0)


* use the insp_countyfips info for the first insp per month for any given truck
bys insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gegen temp = min(insp_time)

gen fips_temp = insp_countyfips if insp_time == temp
destring fips_temp, replace

drop insp_countyfips

bys insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gegen insp_countyfips = max(fips_temp) 

gcollapse (sum) inspection, by(insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month insp_countyfips)

	// there are trucks that experience > 15 inspections in a month, drop them
drop if inspection > 15

* expand to event panel
expand 37
bysort insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gen event_month = _n - 13

assert inrange(event_month,-12,24)

gen event_yearmonth = ym(insp_year,insp_month)
replace event_yearmonth = event_yearmonth + event_month

gen insp_ym = ym(insp_year,insp_month) 
replace insp_ym = . if event_month ~= 0

replace inspection = 0 if event_month ~= 0

compress
save "EventData/temp/insp_month2_VIN.dta", replace


use "EventData/temp/insp_month2_6state_lic.dta", clear

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9"

foreach k of local division {

	append using "EventData/temp/insp_month2_`k'_lic.dta"
	
}


replace insp_year = floor(insp_date/10000)
gen insp_month = floor((insp_date - insp_year*10000)/100)
gen insp_day = insp_date - insp_year * 10000 - insp_month * 100

gen inspection = 1

* insp_time
gen insp_start_hr = floor(insp_start_time/100)
gen insp_start_min = insp_start_time - insp_start_hr * 100

gen insp_time = mdyhms(insp_month,insp_day,insp_year,insp_start_hr,insp_start_min,0)


* use the insp_countyfips info for the first insp per month for any given truck
bys insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gegen temp = min(insp_time)

gen fips_temp = insp_countyfips if insp_time == temp
destring fips_temp, replace

drop insp_countyfips

bys insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gegen insp_countyfips = max(fips_temp) 

gcollapse (sum) inspection, by(insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month insp_countyfips)

	// there are trucks that experience > 15 inspections in a month, drop them
drop if inspection > 15

* expand to event panel
expand 37
bysort insp_unit_vehicle_id_number insp_unit_license insp_unit_license_state insp_year insp_month: gen event_month = _n - 13

assert inrange(event_month,-12,24)

gen event_yearmonth = ym(insp_year,insp_month)
replace event_yearmonth = event_yearmonth + event_month

gen insp_ym = ym(insp_year,insp_month) 
replace insp_ym = . if event_month ~= 0

replace inspection = 0 if event_month ~= 0

compress
save "EventData/temp/insp_month2_lic.dta", replace





* inspection-crash event time correspondence
*VIN
use "EventData/temp/insp_month2_VIN.dta", clear

local varlist "crash fatalities injuries vehicles_in_accident crash_single crash_multi"

merge m:1 event_yearmonth insp_unit_vehicle_id_number using "CrashData/Crash_month_allVIN", nogen keep(master match) keepusing(`varlist')

foreach var of local varlist {
rename `var' `var'VIN
}

merge m:1 event_yearmonth insp_unit_license insp_unit_license_state using "CrashData/Crash_month_alllic", nogen keep(master match) keepusing(`varlist')

foreach var of local varlist {
replace `var'VIN = `var' if `var'VIN == .
}

drop `varlist'
foreach var of local varlist {
rename `var'VIN `var'
}

compress
save "EventData/temp/Insp_Crash_month2_VIN.dta", replace


*lic
use "EventData/temp/insp_month2_lic.dta", clear

local varlist "crash fatalities injuries vehicles_in_accident crash_single crash_multi"

merge m:1 event_yearmonth insp_unit_license insp_unit_license_state using "CrashData/Crash_month_alllic", nogen keep(master match) keepusing(`varlist')

save "EventData/temp/Insp_Crash_month2_lic.dta", replace


* append two datasets
use "EventData/temp/Insp_Crash_month2_lic.dta",clear
append using "EventData/temp/Insp_Crash_month2_VIN.dta"

local crash_var "crash fatalities injuries crash_single crash_multi"
foreach i of local crash_var {
	replace `i' = 0 if `i' == .
}

* gen dep & indep var
gen post_insp = (event_month >= 0 )

gen date = dofm(event_yearmonth)
format date %td
gen year = year(date)
gen month = month(date)
drop date


* gen leads and lags
drop if insp_unit_license == "" & insp_unit_vehicle_id_number == ""

gegen new_license = group(insp_unit_license_state insp_unit_license), missing
replace new_license = 0 if new_license == .
replace new_license = - new_license 

gegen new_VIN = group(insp_unit_vehicle_id_number)
replace new_VIN = new_license if insp_unit_vehicle_id_number == ""

bys new_VIN insp_year insp_month event_month: gen N = _n
	** the case is that VIN number is mis-recorded
drop if N > 1  // drop 0.2%

drop insp_unit_vehicle_id_number insp_unit_license_state insp_unit_license insp_ym N


* outside event window insp indicator
bysort new_VIN: gegen last_insp = max(ym(insp_year,insp_month))
bysort new_VIN: gegen first_insp = min(ym(insp_year,insp_month))

* drop (-12,24) of (first, last) insp
gen start = first_insp + ym(1960,12)
gen end = last_insp - 2*ym(1960,12)

gen insp_yearmonth = ym(insp_year, insp_month)

gen within = 1 if inrange(insp_yearmonth, start,end)

keep if within == 1

* generate event id
gegen new_group_id = group(new_VIN insp_year insp_month)
replace new_group_id = 0 if new_group_id == .
replace new_group_id = . if event_yearmonth == .


tsset new_group_id event_month

* generate inspection lead and lags
quietly forvalues i=1/12 {
gen byte F`i'insp=F`i'.inspection
replace F`i'insp = 0 if F`i'insp == .

gen byte L`i'insp=L`i'.inspection
replace L`i'insp = 0 if L`i'insp == .
}

quietly forvalues i=13/24 {
gen byte L`i'insp=L`i'.inspection
replace L`i'insp = 0 if L`i'insp == .
}

compress
save "$Truck/processed_data/EventData/division_100pct/Insp_Crash_month2_100pct.dta", replace



**************************************************************************
* merge in carrier census to event study data
**************************************************************************

local insp_varlist year month date dayofweek insp_p insp_m event_time post_insp new_VIN new_group_id inspection insp_time dot_number inspection_id insp_year
local crash_varlist crash injuries fatalities crash_event* crash_time 

use `insp_varlist' `crash_varlist' using "$Truck/processed_data/EventData/division_100pct/Insp_Crash_100pct_pre.dta", clear nolabel

keep if dot_number ~= .

* merge in carrier info (driver, vehicle, cargo) (cargo later, too big)
fmerge m:1 dot_number using "CensusData/carrier_census", ///
	keepusing(tot_trucks tot_buses tot_pwr fleetsize tot_cars inter_drs intra_drs avg_tld tot_drs cdl_drs drs_lt100 drs_gt100) ///
	keep(match master) nogen

fmerge m:1 inspection_id insp_year using "$Truck/processed_data/InspectionData/Insp_outcome.dta", ///
	keepusing(oos_total) nogen keep(match master)

gen flag_oos = 1 if oos_total > 0 & oos_total ~= .

compress
save "$Truck/processed_data/EventData/division_100pct/InspCrash_100pct_Carrier", replace














//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
/// FARS data cleaning 
//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////




/* steps:
1. accident data file --> using st_case & year to extract crash time, fatalities, location, etc
2. vehicle data file --> using st_case & veh_no & year to extract VIN, vehicle make, model, driver, etc
3. person data file --> all persons involved (not for now)
*/


* 1. accident data file
forvalues i = 1992(1)1998 {
	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/ACCIDENT.csv", clear

	keep state county city month day year hour minute ve_forms persons fatals st_case harm_ev man_coll weather lgt_cond sp_limit
	
	compress
	
	save "$Truck/processed_data/FARS/accident`i'", replace
}


forvalues i = 1999(1)2009 {
	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/ACCIDENT.csv", clear

	keep state county city month day year hour minute ve_forms persons fatals st_case harm_ev man_coll weather lgt_cond sp_limit latitude longitud 
	 // geocode avail starting 1999

	compress
	
	save "$Truck/processed_data/FARS/accident`i'", replace
}


forvalues i = 2010(1)2018 {
	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/ACCIDENT.csv", clear

	keep state county city month day year hour minute ve_forms persons fatals st_case harm_ev man_coll weather lgt_cond latitude longitud 
	 // speed limit is moved to vehicle data files after 2010

	compress
	
	save "$Truck/processed_data/FARS/accident`i'", replace
}

use "$Truck/processed_data/FARS/accident1992.dta", clear

foreach i of numlist 1993/2018 { 
	append using "$Truck/processed_data/FARS/accident`i'"
}

save "$Truck/processed_data/FARS/accident.dta", replace



* 2. vehicle data file
forvalues i = 1996(1)2017 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/VEHICLE.csv", clear

	
	if `i' == 1996 {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 veh_cf1 veh_cf2
	}
	else if `i' == 1997 {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 dr_cf4 veh_cf1 veh_cf2
	}
	else if inrange(`i',1998,2006) {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 dr_cf4 mcarr_id veh_cf1 veh_cf2
	}
	else if inrange(`i',2007,2008) {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 dr_cf4 mcarr_id mcarr_i1 mcarr_i2 veh_cf1 veh_cf2
	}
	else if `i' == 2009 {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 dr_cf4 mcarr_id mcarr_i1 mcarr_i2 speedrel veh_cf1 veh_cf2
	}
	else if inrange(`i',2010,2017) {
		keep st_case veh_no reg_stat make model body_typ vin v_config trav_sp deaths dr_drink mcarr_id mcarr_i1 mcarr_i2 speedrel dr_sf1 dr_sf2 dr_sf3 dr_sf4 vspd_lim acc_type veh_sc1 veh_sc2
		rename dr_sf1 dr_cf1
		rename dr_sf2 dr_cf2
		rename dr_sf3 dr_cf3
		rename dr_sf4 dr_cf4
		
		rename veh_sc1 veh_cf1 
		rename veh_sc2 veh_cf2
	}
	
	save "$Truck/processed_data/FARS/vehicle`i'", replace
}

foreach i of numlist 1998/2017 { 
	use "$Truck/processed_data/FARS/vehicle`i'.dta", clear

	if inrange(`i',1998,2006) {
	destring mcarr_id, force replace
}
	else if inrange(`i',2007,2017) {
	destring mcarr_id, force replace
	destring mcarr_i1, force replace
	destring mcarr_i2, force replace
}
	compress
	save "$Truck/processed_data/FARS/vehicle`i'.dta", replace

}

use "$Truck/processed_data/FARS/vehicle1996.dta", clear

gen year = 1996

foreach i of numlist 1997/2017 { 

	append using "$Truck/processed_data/FARS/vehicle`i'"
	replace year = `i' if year == .
}

save "$Truck/processed_data/FARS/vehicle.dta", replace



* 3. factor file:

forvalues i = 2010(1)2017 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/FACTOR.csv", clear

	keep st_case veh_no mfactor
	
	bys st_case veh_no: gen N = _n
	
	reshape wide mfactor, i(st_case veh_no) j(N)
		
	gen year = `i'
	
	save "$Truck/processed_data/FARS/factor`i'.dta", replace

}

use "$Truck/processed_data/FARS/factor2010.dta", clear

foreach i of numlist 2011/2017 { 

	append using "$Truck/processed_data/FARS/factor`i'.dta"
}

save "$Truck/processed_data/FARS/factor.dta", replace





*** 4. person file
forvalues i = 1996(1)2017 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/PERSON.csv", clear

	
	if inrange(`i',1996,1998) {
	keep st_case veh_no state per_no age sex per_typ drugs 
	}
	else if inrange(`i',1999,2017) {
	keep st_case veh_no state per_no age sex per_typ drugs	race
	}
	
	save "$Truck/processed_data/FARS/person`i'", replace
}

use "$Truck/processed_data/FARS/person1996.dta", clear

gen year = 1996

foreach i of numlist 1997/2017 { 

	append using "$Truck/processed_data/FARS/person`i'"
	replace year = `i' if year == .
}

save "$Truck/processed_data/FARS/person.dta", replace



* some of drivers info are in vehicle file:
forvalues i = 1996(1)2017 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/VEHICLE.csv", clear

	
	if inrange(`i',1996,1997) {
		keep st_case veh_no cdl_stat l_endors l_compl prev_acc prev_sus prev_dwi prev_spd prev_oth
	}	
	else if inrange(`i',1998,2017) {
		keep st_case veh_no cdl_stat l_endors l_compl prev_acc prev_sus prev_dwi prev_spd prev_oth dr_hgt dr_wgt
	}		
	
	save "$Truck/processed_data/FARS/vehicle_dr`i'", replace
}

use "$Truck/processed_data/FARS/vehicle_dr1996.dta", clear

gen year = 1996

foreach i of numlist 1997/2017 { 

	append using "$Truck/processed_data/FARS/vehicle_dr`i'"
	replace year = `i' if year == .
}

save "$Truck/processed_data/FARS/vehicle_dr.dta", replace


* combine with person files
use "$Truck/processed_data/FARS/person.dta", clear

keep if per_typ == 1

fmerge 1:1 st_case veh_no year using "$Truck/processed_data/FARS/vehicle_dr.dta", nogen

compress
save "$Truck/processed_data/FARS/driver.dta", replace
	// uniquely id: st_case veh_no year



	
	
* 5. violation records
* before 2010, in vehicle files
* after 2010, in violation files
forvalues i = 1996(1)2009 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/VEHICLE.csv", clear

	if inrange(`i',1996,1996) {
		keep st_case veh_no viol_chg
	}	
	else if inrange(`i',1997,2009) {
		keep st_case veh_no violchg1 violchg2 violchg3
	}	
	
	save "$Truck/processed_data/FARS/vehicle_viol`i'", replace
}

use "$Truck/processed_data/FARS/vehicle_viol1996.dta", clear

gen year = 1996

foreach i of numlist 1997/2009 { 

	append using "$Truck/processed_data/FARS/vehicle_viol`i'"
	replace year = `i' if year == .
}

replace violchg1 = viol_chg if year == 1996
replace violchg2 = 0 if year == 1996 
replace violchg3 = 0 if year == 1996 

drop viol_chg

compress
save "$Truck/processed_data/FARS/vehicle_viol.dta", replace

	
* violation files	
forvalues i = 2010(1)2017 {

	
	insheet using "$Truck/processed_data/FARS/FARS`i'NationalCSV/VIOLATN.csv", clear

	keep st_case veh_no mviolatn
	
	bys st_case veh_no: gen n = _n
	
	reshape wide mviolatn, i(st_case veh_no) j(n)
	
	save "$Truck/processed_data/FARS/violation`i'", replace
}
	
use "$Truck/processed_data/FARS/violation2010.dta", clear

gen year = 2010

foreach i of numlist 2011/2017 { 

	append using "$Truck/processed_data/FARS/violation`i'"
	replace year = `i' if year == .
}

compress
save "$Truck/processed_data/FARS/violation.dta", replace

* combine violations of all years
use "$Truck/processed_data/FARS/vehicle_viol.dta", clear

append using "$Truck/processed_data/FARS/violation.dta"

replace violchg1 = mviolatn1 if year >= 2010
replace violchg2 = mviolatn2 if year >= 2010
replace violchg3 = mviolatn3 if year >= 2010
drop mviolatn1  mviolatn2 mviolatn3
	
save "$Truck/processed_data/FARS/violation.dta", replace	
	
	
	
	



* merge vehicle data files with accident data files to find crash time

use "$Truck/processed_data/FARS/accident.dta", clear

forvalues i = 92/97 {

	replace year = 19`i' if year == `i'

}

merge 1:m st_case year using "$Truck/processed_data/FARS/vehicle.dta", assert(match) nogen

compress
save "$Truck/processed_data/FARS/fatals_FARS.dta", replace


** if body type = 50 - 79, then its bus or a truck
** if MCARR_ID (MOTOR CARRIER IDENTIFICATION NUMBER) == 11-character, then it is a motor carrier */
use "$Truck/processed_data/FARS/fatals_FARS.dta",clear

gen truck = (inrange(body_typ,50,79))

gen cmv = .
replace cmv = 1 if inrange(mcarr_i1, 1,57)
replace cmv = 1 if mcarr_id > 0 & mcarr_id~=. // some mcarr_id are unknown or none, keep them for now to see if they merge with inspection, if not merged, then drop

keep if truck == 1 | cmv == 1

* time
gen date = mdy(month,day,year)
gen crash_date = date

gen crash_time = mdyhms(month,day,year,hour,minute,0)

drop month day hour minute

* county
gen str3 county_str = string(county, "%03.0f")
gen str2 state_str = string(state, "%02.0f")

gegen crash_countyfips = concat(state_str county_str)

drop county_str state_str county state truck cmv

drop if vin == "000000000000" | vin == "888888888888" | vin == "999999999999"

bys vin crash_date crash_time crash_countyfips: drop if _n > 1 // the same crash
bys vin mcarr_id crash_date : drop if _n > 1 // the same car

*duplicates drop vin crash_date, force // 23(.05%) of crashes

* generage vin11 vin10 etc for merging 
gen vin11 = substr(vin,1,11)
gen vin10 = substr(vin,1,10)

drop if vin == ""

* combine sp_lim and vsp_lim
replace sp_limit = vspd_lim if sp_limit == .
drop vspd_lim

compress
save "$Truck/processed_data/FARS/truck_FARS.dta", replace
	// st_case, year, veh_no (or vin12) uniquely identifies an observation in this dataset

* merge in driver file
fmerge 1:1 st_case year veh_no using "$Truck/processed_data/FARS/driver.dta", keep(master match) nogen
		// note: unmerged using mostly (93%) come from non-cmv accidents

* merge in violation file	
fmerge 1:1 st_case year veh_no using "$Truck/processed_data/FARS/violation.dta",  keepusing(violchg1 violchg2 violchg3) keep(master match) nogen

	
* merge in factor data file (after 2010)
gen year = year(date)

fmerge 1:1 st_case veh_no year using "$Truck/processed_data/FARS/factor.dta", keep(master match) keepusing(mfactor1) 

drop _merge year

rename mfactor1 mfactor

save "$Truck/processed_data/FARS/truck_FARS.dta", replace
	
	




* FMSCA crash data (vin17 ~= "")
use "CrashData/Crash_allVIN", clear 

* keep 12 digit of VIN only
* drop leading 0s in insp_unit_vehicle_id_number
gen temp_insp_unit_vehicle_id_number = insp_unit_vehicle_id_number

qui count if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 
while r(N) { 
	replace temp_insp_unit_vehicle_id_number = substr(temp_insp_unit_vehicle_id_number, 2, .) if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 	
  	qui count if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 
}

gen vin12 = substr(temp_insp_unit_vehicle_id_number,1,12)

drop if vin12 == ""
drop temp_insp_unit_vehicle_id_number

bys vin12 crash_date crash_time crash_countyfips: gen share = _n
summ share, detail

keep if share == 1  // 169 crashes, drop < 0.01% of crash

keep crash_state insp_unit_vehicle_id_number date crash_date crash_time report_time crash_countyfips vin12 crash fatalities injuries

compress
save "CrashData/Crash_VIN17_12crosswalk", replace



use "CrashData/Crash_alllic",clear

keep if injuries > 0 | fatalities > 0

bys crash_date crash_time crash_countyfips: gen share = _n

keep if share == 1  // drop 83573 crashes, 6%

keep date crash_date crash_time crash_countyfips insp_unit_license insp_unit_license_state

compress
save "CrashData/Crash_lic_crosswalk", replace




* use dot_number & vin12
use "CrashData/Crash_all",clear

replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "SAME" | insp_unit_vehicle_id_number == "UNREADABLE" | insp_unit_vehicle_id_number == "NO VIN" | insp_unit_vehicle_id_number == "RR" | insp_unit_vehicle_id_number == "DRIVER/CARRIER"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKNOWN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "UKN" 
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "000"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "0"
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "."
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "//"	
replace insp_unit_vehicle_id_number = "" if insp_unit_vehicle_id_number == "CARRIER" | insp_unit_vehicle_id_number == "DRIVER"

replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NONE")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"0000000")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"TEMP")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"UNK")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"**")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"--")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"- - -")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"...")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"///")>0
replace insp_unit_vehicle_id_number = "" if strlen(insp_unit_vehicle_id_number)<5
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"#")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"NOT")>0
replace insp_unit_vehicle_id_number = "" if strpos(insp_unit_vehicle_id_number,"white")>0

drop if insp_unit_vehicle_id_number == ""

duplicates drop report_date report_time insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number, force 
	// if report_date & time & license are all the same, this is a single crash event

bysort report_date insp_unit_license insp_unit_license_state location: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// if the report time is different but close, and the crashes happened at the same location, this is also a single crash event	
	// keep the earlier one in terms of report_time
drop report_time_earlier

bysort date insp_unit_vehicle_id_number: egen report_time_earlier = min(report_time)
keep if report_time_earlier == report_time
	// drop location info FOR NOW: there are multiple crashes happen for the same truck on the same day but in different counties

gen crash = 1
collapse (sum) crash fatalities injuries vehicles_in_accident, by(dot_number crash_state crash_countyfips report_date report_time crash_time insp_unit_vehicle_id_number crash_year crash_month crash_day crash_date date event_id1 event_id2 event_id3 event_id4 road_surface_condition_id gvw_rating_id weather_condition light_condition)

drop if dot_number == .

duplicates drop date insp_unit_vehicle_id_number dot_number, force // same crash but event_id are recorded differently

compress
save "CrashData/Crash_VIN_dot", replace



use "CrashData/Crash_VIN_dot", clear 

* keep 12 digit of VIN only
* drop leading 0s in insp_unit_vehicle_id_number
gen temp_insp_unit_vehicle_id_number = insp_unit_vehicle_id_number

qui count if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 
while r(N) { 
	replace temp_insp_unit_vehicle_id_number = substr(temp_insp_unit_vehicle_id_number, 2, .) if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 	
  	qui count if substr(temp_insp_unit_vehicle_id_number, 1, 1) == "0" 
}

gen vin12 = substr(temp_insp_unit_vehicle_id_number,1,12)

drop if vin12 == ""
drop temp_insp_unit_vehicle_id_number


bys dot_number crash_date crash_countyfips crash_time: gen share = _n
drop if share > 1 
drop share

rename dot_number mcarr_i2

save "CrashData/Crash_VIN_dot", replace









**************************************************************************************
* finding vin17 for FARS crash using the vin17_12 cross walk
* note: can find 70% of FARS crash 

use "$Truck/processed_data/FARS/truck_FARS.dta", clear

rename vin vin12

bys vin12 crash_date crash_time crash_countyfips: gen share = _n
assert(share == 1)

merge 1:1 vin12 crash_date crash_time crash_countyfips using "CrashData/Crash_VIN17_12crosswalk", keep(master match) keepusing(insp_unit_vehicle_id_number) 
	* note: merges 60% of FARS crash
drop _merge share vin11 vin10

compress
save "$Truck/processed_data/FARS/truck_FARS.dta", replace


* use crash (county,datetime) to merge in license for FARS crash that cannot be matched using vin (due to missing vin in FMCSA crash)
use "$Truck/processed_data/FARS/truck_FARS.dta", clear

keep if insp_unit_vehicle_id_number ~= ""

save "$Truck/processed_data/FARS/truck_FARS_VIN.dta", replace  // vin17 all exist


use "$Truck/processed_data/FARS/truck_FARS.dta", clear

keep if insp_unit_vehicle_id_number == ""

bys crash_date crash_time crash_countyfips: gen share = _n

preserve
use "CrashData/Crash_lic_crosswalk",clear
gen share = 1
save "CrashData/Crash_lic_crosswalk", replace
restore

merge 1:1 crash_date crash_time crash_countyfips share using "CrashData/Crash_lic_crosswalk", keep(match master) keepusing(insp_unit_license insp_unit_license_state)
	* note: merge 13980 crashes (another 10%)
drop share _merge

append using "$Truck/processed_data/FARS/truck_FARS_VIN.dta"

save "$Truck/processed_data/FARS/truck_FARS.dta", replace




* license only
use "$Truck/processed_data/FARS/truck_FARS.dta", clear

keep if insp_unit_license ~= ""

save "$Truck/processed_data/FARS/truck_FARS_lic.dta", replace  // no vin17, but lic





*** try to use vin12, crash_date, county to merge more:
use "$Truck/processed_data/FARS/truck_FARS.dta", clear

merge 1:1 vin12 crash_date crash_time crash_countyfips using "CrashData/Crash_VIN17_12crosswalk", keep(using) keepusing(insp_unit_vehicle_id_number fatalities)

drop if year(crash_date) > 2017

keep if _merge == 2 & fatalities > 0 & insp_unit_vehicle_id_number ~= ""
drop _merge

drop if crash_countyfips == "000"

drop if substr(vin12, 12, 1) == ""

save "$Truck/processed_data/FARS/temp/check_truck_FARS.dta", replace

use "$Truck/processed_data/FARS/truck_FARS.dta", clear

merge 1:1 vin12 crash_date crash_time crash_countyfips using "CrashData/Crash_VIN17_12crosswalk", keep(master) keepusing(insp_unit_vehicle_id_number) 

duplicates drop vin12 crash_date crash_countyfips, force  // 1 obs

rename crash_time FARS_time
drop _merge insp_unit_vehicle_id_number

merge 1:1 vin12 crash_date crash_countyfips using "$Truck/processed_data/FARS/temp/check_truck_FARS.dta", keep(master match) keepusing(insp_unit_vehicle_id_number crash_time) 
  
drop _merge

save "$Truck/processed_data/FARS/temp/truck_FARS_VIN2.dta", replace

use "$Truck/processed_data/FARS/truck_FARS.dta", clear

merge 1:1 vin12 crash_date crash_time crash_countyfips using "CrashData/Crash_VIN17_12crosswalk", keep(match) keepusing(insp_unit_vehicle_id_number) nogen

append using "$Truck/processed_data/FARS/temp/truck_FARS_VIN2.dta"

compress
save "$Truck/processed_data/FARS/truck_FARS_coarse.dta", replace  // 77% FARS are found.


* use carrier_id, crash datetime, county to find more matches
use "$Truck/processed_data/FARS/truck_FARS_coarse.dta", clear

format mcarr_id %11.0f

tostring mcarr_id, replace format(%11.0f)

gen mcarr_id1 = substr(mcarr_id, 1,2)
gen mcarr_id2 = substr(mcarr_id, 3,.)

destring mcarr_id1 mcarr_id2 mcarr_id, replace
replace mcarr_i1 = mcarr_id1 if mcarr_i1 == .
replace mcarr_i2 = mcarr_id2 if mcarr_i2 == .

drop mcarr_id1 mcarr_id2

keep if mcarr_i1 == 57  // 54%

bys mcarr_i2 crash_countyfips crash_date crash_time: gen share = _n
keep if share == 1
drop share

merge 1:1 mcarr_i2 crash_countyfips crash_date crash_time using "CrashData/Crash_VIN_dot", keep(master match) keepusing(event_id1)
	* note: only merge 10 more crashes, not worth it....



****************************************************************************	
* merge with inspection data
* inspection-crash event time correspondence

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9 6state"


foreach k of local division {

*VIN
use "EventData/insp_panel/insp_event_time_`k'_VIN.dta", clear

drop insp_unit_decal insp_unit_decal_number insp_state insp_date location

* fars only avail 1996 to 2017
drop if insp_year == 2018
drop if mdy(insp_month,insp_day,insp_year) > mdy(12,17,2017)  // no crash recorded in 2018
drop if mdy(insp_month,insp_day,insp_year) < mdy(1,14,1996)  // no crash recorded before 1996

* merge with FARS using vin & date
local varlist "ve_forms persons harm_ev man_coll sp_limit lgt_cond weather fatals st_case city latitude longitud veh_no make model body_typ trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 vin12 v_config dr_cf4 mcarr_id mcarr_i1 mcarr_i2 speedrel reg_stat acc_type crash_date crash_time crash_countyfips"

merge m:1 date insp_unit_vehicle_id_number using "$Truck/processed_data/FARS/truck_FARS_VIN.dta", nogen keep(master match) keepusing(`varlist')

foreach var of local varlist {
rename `var' `var'VIN
}


merge m:1 date insp_unit_license insp_unit_license_state using "$Truck/processed_data/FARS/truck_FARS_lic.dta", nogen keep(master match) keepusing(`varlist')

foreach var of varlist ve_forms persons harm_ev man_coll sp_limit lgt_cond weather fatals st_case city latitude longitud veh_no make model body_typ trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 v_config dr_cf4 mcarr_id mcarr_i1 mcarr_i2 speedrel reg_stat acc_type crash_date crash_time  {
replace `var'VIN = `var' if `var'VIN == .
}
replace crash_countyfipsVIN = crash_countyfips if crash_countyfipsVIN == ""

drop `varlist'
foreach var of local varlist {
rename `var'VIN `var'
}

compress
save "EventData/temp/Insp_FARS_`k'_VIN.dta", replace


*lic
use "EventData/insp_panel/insp_event_time_`k'_lic.dta", clear

drop insp_unit_decal insp_unit_decal_number insp_state insp_date location

* fars only avail 1996 to 2017
drop if insp_year == 2018
drop if mdy(insp_month,insp_day,insp_year) > mdy(12,17,2017)  // no crash recorded in 2018
drop if mdy(insp_month,insp_day,insp_year) < mdy(1,14,1996)  // no crash recorded before 1996

merge m:1 date insp_unit_license insp_unit_license_state using "$Truck/processed_data/FARS/truck_FARS_lic.dta", nogen keep(master match) keepusing(`varlist')

save "EventData/temp/Insp_FARS_`k'_lic.dta", replace



* append two datasets
use "EventData/temp/Insp_FARS_`k'_lic.dta",clear
append using "EventData/temp/Insp_FARS_`k'_VIN.dta"


gen inspection = 1 if event_time == 0
replace inspection = 0 if inspection == .
replace fatals = 0 if fatals == .
replace deaths = 0 if deaths == .


* gen dep & indep var
gen insp_start_hr = floor(insp_start_time/100)
gen insp_start_min = insp_start_time - insp_start_hr * 100

gen insp_time = mdyhms(insp_month,insp_day,insp_year,insp_start_hr,insp_start_min,0)

drop insp_start_hr insp_start_min

gen post_insp = (event_time >= 0 )

gen year = year(date)
gen month = month(date)
gen dayofweek = dow(date)

compress
cap mkdir "EventData/FARS"
save "EventData/FARS/Insp_FARS_`k'.dta", replace


}

* append all divisions
local insp_list "inspection_id dot_number insp_start_time insp_end_time insp_year insp_unit_license insp_unit_license_state insp_unit_vehicle_id_number insp_countyfips insp_month insp_day event_time date" 

local fars_list "vin12 harm_ev man_coll fatals st_case trav_sp deaths dr_drink dr_cf1 dr_cf2 dr_cf3 dr_cf4 speedrel crash_date crash_time crash_countyfips inspection insp_time post_insp year month dayofweek"


use `insp_list' `fars_list' using "EventData/FARS/Insp_FARS_6state.dta", clear

local division "d1 d2 d3 d4 d5 d6 d7 d8 d9"
foreach k of local division {
	append using "$Truck/processed_data/EventData/FARS/Insp_FARS_`k'.dta", keep(`insp_list' `fars_list')
}


* use period after year 2000
keep if year >= 2000

gegen new_license = group(insp_unit_license_state insp_unit_license)
replace new_license = 0 if new_license == .
replace new_license = - new_license  

gegen new_VIN = group(insp_unit_vehicle_id_number)
replace new_VIN = new_license if insp_unit_vehicle_id_number == ""

gegen new_group_id = group(new_VIN inspection_id insp_year)
replace new_group_id = 0 if new_group_id == .
replace new_group_id = . if event_time == .


* outside event window insp indicator
bysort new_VIN: gegen last_insp = max(mdy(insp_month, insp_day, insp_year))
bysort new_VIN: gegen first_insp = min(mdy(insp_month, insp_day, insp_year))

gen insp_p = 0
gen insp_m = 0
replace insp_p = 1 if mdy(insp_month, insp_day, insp_year) > first_insp
replace insp_m = 1 if mdy(insp_month, insp_day, insp_year) < last_insp


* drop events within 3,6,12,18,24 hours of inspection
foreach t of numlist 18 24 {
	local hr_`t' = mdyhms(1,1,2000,`t',0,0)-mdyhms(1,1,2000,0,0,0)

	gen temp_crash_`t'hr = 1 if crash_time >= insp_time - `hr_`t'' & crash_time < insp_time
	bysort new_group_id: gegen crash_event_`t'hr = max(temp_crash_`t'hr)
}

drop temp_crash*

* generate inspection lead and lags
tsset new_group_id event_time

quietly forvalues i=1/14 { 
gen byte F`i'insp=F`i'.inspection
*replace F`i'insp = 0 if F`i'insp == .

gen byte L`i'insp=L`i'.inspection
*replace L`i'insp = 0 if L`i'insp == .
}


* 2-day bins
* combine into 2-day bins to increase power [(F14,F13)...(F2,F1),(0,L1),(L2,L3)...(L12,L13)
egen insp_m0_1=rowmax(inspection L1insp)
replace insp_m0_1=0 if insp_m0_1==.

quietly forvalues i=2(2)12 {
local k=`i'+1
egen insp_m`i'_`k'=rowmax(L`i'insp L`k'insp)
replace insp_m`i'_`k'=0 if insp_m`i'_`k'==.
}

quietly forvalues i=1(2)13 {
local k=`i'+1
egen insp_p`i'_`k'=rowmax(F`i'insp F`k'insp)
replace insp_p`i'_`k'=0 if insp_p`i'_`k'==.
}	

* drop 1-day leads and lags
local insp_F
forvalues t = 14(-1)1 {
local insp_F `insp_F' F`t'insp
}
forvalues t = 1(1)14 {
local insp_F `insp_F' L`t'insp
}	

drop `insp_F'


* merge in ve_forms (the # of moving vehicles)
merge m:1 st_case vin12 date using "$Truck/processed_data/FARS/truck_FARS.dta", keep(master match) keepusing(ve_forms mfactor veh_cf1 veh_cf2) nogen

save "$Truck/processed_data/EventData/FARS/Insp_FARS_100pct.dta", replace




* merge in driver info and violation info
use "$Truck/processed_data/EventData/FARS/Insp_FARS_100pct.dta",clear

merge m:1 st_case vin12 date using "$Truck/processed_data/FARS/truck_FARS.dta", keep(master match) keepusing(age sex drugs race dr_wgt dr_hgt cdl_stat l_compl prev_acc prev_sus prev_dwi prev_spd prev_oth violchg1 violchg2 violchg3) nogen

compress
save "$Truck/processed_data/EventData/FARS/Insp_FARS_100pct.dta", replace



//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
/////////// Texas DOT crash data
//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////

/* steps:
1. crash file --> using crash_id, cmv_involv_fl to extract crash time, county, harm_evnt_id

2. unit file --> using st_case & veh_no & year to extract VIN, vehicle make, model, driver, etc
3. person data file --> all persons involved (not for now)
*/



* 1. crash file

* Begin/End data yr
local fromyr 	2010
local thruyr 	2018

* Year loop
forvalues y = `fromyr'/`thruyr' {

	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "*crash*"

	local i = 1
	
	foreach file in `files' {
		
		insheet using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'", clear
		
		keep crash_id crash_fatal_fl cmv_involv_fl crash_date crash_time rpt_cris_cnty_id harm_evnt_id
		
		keep if cmv_involv_fl == "Y"
		drop cmv_involv_fl 
		
		compress
		save "$Truck/processed_data/TxDOT_CRIS/`y'/crash`i'.dta", replace
		
		local i = `i' + 1
	}
	*** End of season loop
	
	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "crash*"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
		erase "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
	}		
	save "$Truck/processed_data/TxDOT_CRIS/`y'/crash.dta", replace
	
	isid crash_id
}
	

	
	
	
	
* 2. unit file 

* Begin/End data yr
local fromyr 	2010
local thruyr 	2018

* Year loop
forvalues y = `fromyr'/`thruyr' {
	
	* drop previously generated unit file
	cap erase "$Truck/processed_data/TxDOT_CRIS/`y'/unit.dta"

	* collect all season unit files
	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "*unit*"

	local i = 1
	
	foreach file in `files' {
		
		insheet using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'", clear
		
		keep crash_id unit_nbr vin veh_cmv_fl cmv_fiveton_fl cmv_hazmat_fl cmv_nine_plus_pass_fl cmv_veh_oper_id cmv_carrier_id_type_id cmv_veh_type_id cmv_evnt* contrib_factr* veh_dfct* tot_injry_cnt death_cnt veh_transp_name veh_transp_dest veh_dmag_scl*
		
		keep if veh_cmv_fl == "Y"
		drop veh_cmv_fl 
		
		compress
		save "$Truck/processed_data/TxDOT_CRIS/`y'/unit`i'.dta", replace
		
		local i = `i' + 1
	}
	*** End of season loop
	
	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "unit*"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
		erase "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
	}		
	save "$Truck/processed_data/TxDOT_CRIS/`y'/unit.dta", replace
	
	isid crash_id unit_nbr
}
	




* 3. person file

* Begin/End data yr
local fromyr 	2010
local thruyr 	2018

* Year loop
forvalues y = `fromyr'/`thruyr' {

	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "*person*"

	local i = 1
	
	foreach file in `files' {
		
		insheet using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'", clear
		
		keep crash_id unit_nbr prsn_nbr prsn_type_id prsn_age prsn_ethnicity_id prsn_gndr_id prsn_alc_rslt_id prsn_drg_rslt_id
		
		keep if prsn_type_id == 1  // driver
		drop prsn_type_id 
		
		compress
		save "$Truck/processed_data/TxDOT_CRIS/`y'/person`i'.dta", replace
		
		local i = `i' + 1
	}
	*** End of season loop
	
	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "person*"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
		erase "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
	}		
	save "$Truck/processed_data/TxDOT_CRIS/`y'/person.dta", replace
	
	isid crash_id unit_nbr prsn_nbr
}
	

	
* 4. charge files (edited 202410)

* Begin/End data yr
local fromyr 	2010
local thruyr 	2018

* Year loop
forvalues y = `fromyr'/`thruyr' {

	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "*charges*"

	local i = 1 
	
	foreach file in `files' {
		
		insheet using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'", clear
		
		keep crash_id unit_nbr prsn_nbr charge
		
		keep if charge ~= "NO CHARGES"

		gen Icharge = 1

		gen charge_veh = 1 if strpos(charge,"INSPECTION")>0
		replace charge_veh = 1 if strpos(charge,"LICENSE")>0
		replace charge_veh = 1 if strpos(charge,"REGIS")>0
		replace charge_veh = 1 if strpos(charge,"IMPROPER")>0
		replace charge_veh = 1 if strpos(charge,"LIABILITY")>0
		replace charge_veh = 1 if strpos(charge,"INSURANCE")>0
		replace charge_veh = 1 if strpos(charge,"FINANCIAL RESPONSIBILITY")>0
		replace charge_veh = 1 if strpos(charge,"PERMIT")>0
		replace charge_veh = 1 if strpos(charge,"LIGHT")>0
		replace charge_veh = 1 if strpos(charge,"MAINTAINANCE")>0
		replace charge_veh = 1 if strpos(charge,"EQUIPMENT")>0
		replace charge_veh = 1 if strpos(charge,"DL")>0
		replace charge_veh = 1 if strpos(charge,"DEFECT")>0
		replace charge_veh = 1 if strpos(charge,"BRAKE")>0
		 
		gcollapse (min) charge_veh Icharge, by(crash_id unit_nbr prsn_nbr)
						
		compress
		save "$Truck/processed_data/TxDOT_CRIS/`y'/charges`i'.dta", replace
		
		local i = `i' + 1
	}
	*** End of season loop
	
	local files: dir "$Truck/processed_data/TxDOT_CRIS/`y'" files "charges*"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
		erase "$Truck/processed_data/TxDOT_CRIS/`y'/`file'"
	}		
	save "$Truck/processed_data/TxDOT_CRIS/`y'/charge_new.dta", replace
	
	isid crash_id unit_nbr prsn_nbr 
}






*** combine crash, unit, person files

* Begin/End data yr
local fromyr 	2010
local thruyr 	2018

local file "crash unit person"

foreach i in `file' {
	clear

	forvalues y = `fromyr'/`thruyr' {

		append using "$Truck/processed_data/TxDOT_CRIS/`y'/`i'.dta"
	}
	
	compress
	save "$Truck/processed_data/TxDOT_CRIS/`i'.dta", replace
}



use "$Truck/processed_data/TxDOT_CRIS/crash.dta", clear

merge 1:m crash_id using "$Truck/processed_data/TxDOT_CRIS/unit.dta", keep(match) nogen

merge 1:m crash_id unit_nbr using "$Truck/processed_data/TxDOT_CRIS/person.dta", keep(master match) nogen


replace prsn_nbr = -1 if prsn_nbr == .  // not matched --> no data


isid crash_id unit_nbr prsn_nbr

* format crash date
gen date = date(crash_date, "MDY")
format date %td

compress
save "$Truck/processed_data/TxDOT_CRIS/crashData.dta", replace



* combine charge file
use "$Truck/processed_data/TxDOT_CRIS/2010/charge_new.dta", clear

forvalues y = 2011/2018 {
	append using "$Truck/processed_data/TxDOT_CRIS/`y'/charge_new.dta"

}

* keep if veh is cmv
fmerge 1:1 crash_id unit_nbr prsn_nbr using "$Truck/processed_data/TxDOT_CRIS/crashData.dta", keep(match) nogen

gcollapse (max) Icharge charge_veh, by(crash_id unit_nbr)

save "$Truck/processed_data/TxDOT_CRIS/charge_new.dta", replace


* crashes with vin all exist
use "$Truck/processed_data/TxDOT_CRIS/crash.dta", clear

merge 1:m crash_id using "$Truck/processed_data/TxDOT_CRIS/unit.dta", keep(match) nogen

* format crash date
gen date = date(crash_date, "MDY")
format date %td

* drop mis-record vin
drop if vin == ""
drop if strpos(vin,"00000000000000")>0
drop if vin == "0"
drop if vin == "00000"
drop if vin == "0000000"
drop if vin == "00000000"
drop if vin == "0000000000000"

* federal recordable: injuries or towed	
gen tow = 1
replace tow = 0 if veh_transp_name == "" |  veh_transp_dest == ""
replace tow = 0 if strpos(veh_transp_name,"DRIV")>0 |  strpos(veh_transp_dest,"DRIV")>0
replace tow = 0 if strpos(veh_transp_name,"NOT TOWED")>0 | strpos(veh_transp_dest,"NOT TOWED")>0
replace tow = 0 if strpos(veh_transp_name,"WRECKER")>0 | strpos(veh_transp_dest,"WRECKER")>0

drop veh_transp_name veh_transp_dest

gen fed_record = 1 if tow == 1 
replace fed_record = 1 if tot_injry_cnt > 0 & tot_injry_cnt ~= .
replace fed_record = 1 if death_cnt > 0 & death_cnt ~= .
		
* check uniqueness
bys date vin: gen dup = _n
drop if dup >= 2  // < .5%

isid date vin

compress
save "$Truck/processed_data/TxDOT_CRIS/crash_vin.dta", replace



* compare with FMCSA crash data

use "$Truck/processed_data/CrashData/Crash_allVIN", clear

keep if crash_state == "TX"

keep if inrange(crash_year,2010,2018)

rename insp_unit_vehicle_id_number vin

keep report_time crash_countyfips event_id1 event_id2 event_id3 event_id4 crash fatalities injuries vin date

merge 1:1 vin date using "$Truck/processed_data/TxDOT_CRIS/crash_vin.dta"

rename (vin _merge) (insp_unit_vehicle_id_number merge_fmcsa_txdot)

compress
save "$Truck/processed_data/TxDOT_CRIS/crash_vin2.dta", replace


* merge with inspection data

use "EventData/insp_panel/insp_event_time_6state_VIN.dta", clear

keep if insp_state == "TX"

drop insp_unit_decal insp_unit_decal_number insp_state insp_date location

merge m:1 date insp_unit_vehicle_id_number using "$Truck/processed_data/TxDOT_CRIS/crash_vin2.dta"

rename _merge merge_insp_txdot

* in insp
gen inspection = 1 if event_time == 0
replace inspection = 0 if inspection == .

* in fmcsa
replace crash = 0 if crash == .
replace injuries = 0 if injuries == .
replace fatalities = 0 if fatalities == .

* in txdot
gen crash_tx = (crash_id ~=.)
replace tot_injry_cnt = 0 if tot_injry_cnt == .
replace death_cnt = 0 if death_cnt == .



* gen dep & indep var
gen insp_start_hr = floor(insp_start_time/100)
gen insp_start_min = insp_start_time - insp_start_hr * 100

gen insp_time = mdyhms(insp_month,insp_day,insp_year,insp_start_hr,insp_start_min,0)

drop insp_start_hr insp_start_min

gen post_insp = (event_time >= 0 )

gen year = year(date)
gen month = month(date)
gen dayofweek = dow(date)



compress
save "$Truck/processed_data/TxDOT_CRIS/Insp_Crash_Event_tx.dta", replace




* generate analysis data
use "$Truck/processed_data/TxDOT_CRIS/Insp_Crash_Event_tx.dta", clear

gegen new_VIN = group(insp_unit_vehicle_id_number)

gegen new_group_id = group(new_VIN inspection_id insp_year)
replace new_group_id = 0 if new_group_id == .
replace new_group_id = . if event_time == .


* outside event window insp indicator
bysort new_VIN: gegen last_insp = max(mdy(insp_month, insp_day, insp_year))
bysort new_VIN: gegen first_insp = min(mdy(insp_month, insp_day, insp_year))

gen insp_p = 0
gen insp_m = 0
replace insp_p = 1 if mdy(insp_month, insp_day, insp_year) > first_insp
replace insp_m = 1 if mdy(insp_month, insp_day, insp_year) < last_insp


* crash time
rename crash_time crash_tx_time

gen temp = crash_date + " " + crash_tx_time

gen crash_time = clock(temp, "MDYhm")
format crash_time %tc
format insp_time %tc

drop temp

* drop events within 3,6,12,18,24 hours of inspection
foreach t of numlist 12 18 {
	local hr_`t' = mdyhms(1,1,2000,`t',0,0)-mdyhms(1,1,2000,0,0,0)

	gen temp_crash_`t'hr = 1 if crash_time >= insp_time - `hr_`t'' & crash_time < insp_time
	bysort new_group_id: gegen crash_event_`t'hr = max(temp_crash_`t'hr)
}

drop temp_crash*


tsset new_group_id event_time

quietly forvalues i=1/14 { 
gen byte F`i'insp=F`i'.inspection
replace F`i'insp = 0 if F`i'insp == .

gen byte L`i'insp=L`i'.inspection
replace L`i'insp = 0 if L`i'insp == .
}


* 2-day bins
* combine into 2-day bins to increase power [(F14,F13)...(F2,F1),(0,L1),(L2,L3)...(L12,L13)
egen insp_m0_1=rowmax(inspection L1insp)
replace insp_m0_1=0 if insp_m0_1==.

quietly forvalues i=2(2)12 {
local k=`i'+1
egen insp_m`i'_`k'=rowmax(L`i'insp L`k'insp)
replace insp_m`i'_`k'=0 if insp_m`i'_`k'==.
}

quietly forvalues i=1(2)13 {
local k=`i'+1
egen insp_p`i'_`k'=rowmax(F`i'insp F`k'insp)
replace insp_p`i'_`k'=0 if insp_p`i'_`k'==.
}	


* merge in fed_record 
merge m:1 date insp_unit_vehicle_id_number using "$Truck/processed_data/TxDOT_CRIS/crash_vin2.dta", keepusing(fed_record veh_dmag_scl*) keep(master match) nogen

* add more into fed_record
replace fed_record = 1 if crash == 1  // if appears in fmcsa, then it's federal recordable
replace fed_record = 1 if inrange(veh_dmag_scl_1, 5,7)
replace fed_record = 1 if inrange(veh_dmag_scl_2, 5,7)

count if crash_tx == 1 & fed_record == 1 & inrange(event_time,-14,13) & crash_event_18hr ~= 1  //765


* merge in charge files
fmerge m:1 crash_id unit_nbr using "$Truck/processed_data/TxDOT_CRIS/charge.dta", nogen keep(master match) keepusing(Icharge)

compress
save "$Truck/processed_data/TxDOT_CRIS/Insp_Crash_Event_tx.dta", replace






//////////////////////////////////////////////////////////////////////////
///////////////// FHWA traffic class data, from CD 
//////////////////////////////////////////////////////////////////////////


{/* station description data */

* Begin/End data yr
local fromyr 	2012
local thruyr 	2018

/* Time the code
local time_start = clock("`c(current_date)' `c(current_time)'", "DMY hms")
di %tc `time_start'
*/

* create a new directory for tmas in processed_data
cap mkdir "$Truck/processed_data/tmas"

* Station-level data
if 1 {
	
	cap mkdir "$Truck/processed_data/tmas/station"
	
	* Year loop
	qui forvalues y = `fromyr'/`thruyr' {
		
		noisily di "*** Processing station level data for y`y'..."
		
		* from FOIA disk, station data stored in state-segments
*		if inrange(`y',2004,2010) {
			cap mkdir "$Truck/processed_data/tmas/station/`y'"
			
			local files: dir "$Truck/raw_data/TrafficClass/`y'/1. Station" files "*.STA"
			
			foreach file in `files' {
				infile using "$Truck/scripts/station.dct", using("$Truck/raw_data/TrafficClass/`y'/1. Station/`file'") clear
				save "$Truck/processed_data/tmas/station/`y'/`file'.dta", replace
			}
			*** End of state loop
			
			local files: dir "$Truck/processed_data/tmas/station/`y'" files "*.dta"
			clear
			foreach file in `files' {
				append using "$Truck/processed_data/tmas/station/`y'/`file'"
				erase "$Truck/processed_data/tmas/station/`y'/`file'"
			}
			
			rmdir  "$Truck/processed_data/tmas/station/`y'"
*		}




		/* Post-2011 from DOT web, station data in one file
		else {
			
			! "`7z'" e "$DOT/data/raw/tmas/`y'/`y'_station_data.zip" -o"$DOT/data/proc/tmas/station"
			if `y'==2011 {
				infile using "$DOT/scripts/1_raw_to_proc/x_tmas/station.dct", using("$DOT/data/proc/tmas/station/TMAS station data year `y' all states.sta") clear
				erase "$DOT/data/proc/tmas/station/TMAS station data year `y' all states.sta"
			}
			else {
				infile using "$DOT/scripts/1_raw_to_proc/x_tmas/station.dct", using("$DOT/data/proc/tmas/station/TMAS`y'.sta") clear
				erase "$DOT/data/proc/tmas/station/TMAS`y'.sta"
			}
		}*/
		
		* QA
		* drop if recordtype == "" | recordtype == ""
		*assert recordtype=="S"
		drop if recordtype ~= "S"
		* 	Note: 2015/16 have some empty records (recordtype == "") 
		* 		  17/18 have a 
		
		* if duplicates in all variables
		duplicates tag, gen(duplicates) 
		noisily tab duplicates 
		
		bys fips_st stationid dir_travel lane_travel yr_data function_class ///
			: gen tmpvar_Nrecord = _N
		noisily tab tmpvar_Nrecord

		* 	Note: 2016/17/18 record every station twice
		bys fips_st stationid dir_travel lane_travel yr_data function_class: keep if _n==1
		drop tmpvar_Nrecord duplicates
		
		compress
		save "$Truck/processed_data/tmas/station/station_`y'.dta", replace
	}
	*** End of year loop
	
}
}

* drop function_class because it is not in the class data
if 1 {

	forvalues y = `fromyr'/`thruyr' {
		
		use "$Truck/processed_data/tmas/station/station_`y'.dta", clear
		
		* drop duplicates in station_code & county
		duplicates drop fips_st stationid dir_travel lane_travel yr_data fips_cnty, force

		bys fips_st stationid dir_travel lane_travel yr_data: keep if _n == _N

		save "$Truck/processed_data/tmas/station/station_`y'_nofunc.dta", replace

}

* Note: data entry error in 2013
use "$Truck/processed_data/tmas/station/station_2013_nofunc.dta", clear

duplicates drop fips_st stationid dir_travel lane_travel, force

save "$Truck/processed_data/tmas/station/station_2013_nofunc.dta", replace

}



/* traffic class data */

* Begin/End data yr
local fromyr 	2012
local thruyr 	2018


* Station-level data
if 1 {
	
	cap mkdir "$Truck/processed_data/tmas/class"
	
	* Year loop
	qui forvalues y = `fromyr'/`thruyr' {
		
		noisily di "*** Processing station level vehicle class data for y`y'..."
		
		cap mkdir "$Truck/processed_data/tmas/class/`y'"

		* from FOIA disk, station data stored in state-segments by month

			* Loop over monthly directorys
			local dirs: dir "$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth" dirs "*"
			
			foreach dir in `dirs' {
				
				* Loop over state files in a given month folder
				local files: dir "$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth/`dir'" files "*(TMAS).CLS"
				
				foreach file in `files' {
					infile using "$Truck/scripts/class.dct", using("$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth/`dir'/`file'") clear
					save "$Truck/processed_data/tmas/class/`y'/tmpfile_`file'.dta", replace
				}
				
				local files: dir "$Truck/processed_data/tmas/class/`y'" files "tmpfile_*.dta"
				clear
				foreach file in `files' {
					append using "$Truck/processed_data/tmas/class/`y'/`file'"
					erase "$Truck/processed_data/tmas/class/`y'/`file'"
				}
				
				save "$Truck/processed_data/tmas/class/`y'/volume_`dir'.dta", replace
				*** End of state file loop
				
			}
			
			local files: dir "$Truck/processed_data/tmas/class/`y'" files "volume_*.dta"
			clear
			foreach file in `files' {
				append using "$Truck/processed_data/tmas/class/`y'/`file'"
				erase "$Truck/processed_data/tmas/class/`y'/`file'"
			}
			*** End of monthly directory loop
			
			rmdir "$Truck/processed_data/tmas/class/`y'"

		* QA
		assert recordtype =="C"
		
		duplicates tag, gen(duplicates) 
		noisily tab duplicates 
		
		bys fips_st stationid dir_travel lane_travel yr_data month_data day_data hour_data ///
			: keep if _n==1
		drop duplicates
		
		destring total_volume class*, replace force
		
		compress
		save "$Truck/processed_data/tmas/class/class_`y'.dta", replace
	}
	*** End of year loop
	
}
	
* extract 2016 using updated data from disk	
local y = "2016Update"

		cap mkdir "$Truck/processed_data/tmas/class/`y'"
			* Loop over monthly directorys
			local dirs: dir "$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth" dirs "*"
			
			foreach dir in `dirs' {
				
				* Loop over state files in a given month folder
				local files: dir "$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth/`dir'" files "*(TMAS).CLS"
				
				foreach file in `files' {
					infile using "$Truck/scripts/class.dct", using("$Truck/raw_data/TrafficClass/`y'/3. Class/ByMonth/`dir'/`file'") clear
					save "$Truck/processed_data/tmas/class/`y'/tmpfile_`file'.dta", replace
				}
				
				local files: dir "$Truck/processed_data/tmas/class/`y'" files "tmpfile_*.dta"
				clear
				foreach file in `files' {
					append using "$Truck/processed_data/tmas/class/`y'/`file'"
					erase "$Truck/processed_data/tmas/class/`y'/`file'"
				}
				
				save "$Truck/processed_data/tmas/class/`y'/volume_`dir'.dta", replace
				*** End of state file loop
				
			}
			
			local files: dir "$Truck/processed_data/tmas/class/`y'" files "volume_*.dta"
			clear
			foreach file in `files' {
				append using "$Truck/processed_data/tmas/class/`y'/`file'"
				erase "$Truck/processed_data/tmas/class/`y'/`file'"
			}
			*** End of monthly directory loop
			
			rmdir "$Truck/processed_data/tmas/class/`y'"

		* QA
		assert recordtype =="C"
		
		duplicates tag, gen(duplicates) 
		noisily tab duplicates   // 4% duplicates
		
		bys fips_st stationid dir_travel lane_travel yr_data month_data day_data hour_data ///
			: keep if _n==1
		drop duplicates
		
		destring total_volume class* yr_data, replace force
		replace yr_data = 2016 if yr_data == 16
		
		compress
		save "$Truck/processed_data/tmas/class/class_2016.dta", replace
	
	
	
	
	
	
* aggregate to county-hour 
log using "$Truck/outputs/Log/class_cnty_hr", append text name(aggregation)


* Begin/End data yr
local fromyr 	2012
local thruyr 	2018


* county-level data
if 1 {
	
	cap mkdir "$Truck/processed_data/tmas/class/temp" 
	
	* Year loop
	qui forvalues y = `fromyr'/`thruyr' {
		
		noisily di "*** Processing vehicle class data for y`y'..."
		
		use "$Truck/processed_data/tmas/class/class_`y'.dta", clear
		
		* merge in countyfips from station data
		noisily fmerge m:1 fips_st stationid dir_travel lane_travel yr_data using "$Truck/processed_data/tmas/station/station_`y'_nofunc.dta", keep(match master) keepusing(fips_cnty)
			
		* aggregate to cnty-hour	
		collapse (sum) total_volume class*, by(fips_st fips_cnty dir_travel month_data day_data hour_data yr_data)
		
		destring yr_data, force replace
		
		save "$Truck/processed_data/tmas/class/temp/cntyXhour_`y'.dta", replace
	}
	
	local files: dir "$Truck/processed_data/tmas/class/temp" files "cntyXhour*.dta"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/tmas/class/temp/`file'"
		*erase "$Truck/processed_data/tmas/class/temp/`file'"
	}
	*** End of Year loop
	
	* label veh class
	local class "motorcycle passenger 4_tire bus truck1 truck2 truck3 truck4 truck5 truck6 truck7 truck8 truck9 unclass1 unclass2" 

	local v = 1
	foreach i in `class' {
		label var class_`v'_count "`i'"
		local v = `v' + 1
		}
	
	compress
	save "$Truck/processed_data/tmas/class/class_cntyXhour.dta", replace
}
	

use "$Truck/processed_data/tmas/class/class_cntyXhour.dta", clear

* total truck count
gen truck_count = class_4_count + class_5_count + class_6_count + class_7_count + class_8_count + class_9_count + ///
	class_10_count + class_11_count + class_12_count + class_13_count

drop class_14_count class_15_count  // both are 0 for all obv

egen countyfips = concat(fips_st fips_cnty)

destring month_data day_data hour_data, force replace
gen date_td = mdy(month_data, day_data, yr_data)

save "$Truck/processed_data/tmas/class/class_cntyXhour.dta", replace

drop if fips_cnty == ""

collapse (sum) total_volume truck_count class*, by(fips_st month_data day_data hour_data fips_cnty yr_data countyfips date_td)

gen count_added = class_1_count + class_2_count + class_3_count + class_4_count + class_5_count + class_6_count + class_7_count + class_8_count + class_9_count + class_10_count + class_11_count + class_12_count + class_13_count		
replace count_added = total_volume if count_added == . & total_volume ~= -1 & total_volume ~= .  // Note: no class counts, only total volume is available 

compress
save "$Truck/processed_data/tmas/class/class_cntyXhour_1.dta", replace









* aggregate to station-daily

* Begin/End data yr
local fromyr 	2012
local thruyr 	2018


* station-level data
if 1 {
	
	cap mkdir "$Truck/processed_data/tmas/class/temp" 
	
	* Year loop
	qui foreach y of numlist 2012/2015 2017/2018 {
		
		noisily di "*** Processing vehicle class data for y`y'..."
		
		use "$Truck/processed_data/tmas/class/class_`y'.dta", clear
		
		* merge in countyfips from station data
		noisily fmerge m:1 fips_st stationid dir_travel lane_travel yr_data using "$Truck/processed_data/tmas/station/station_`y'_nofunc.dta", ///
			keep(match master) keepusing(fips_cnty num_lane_vehclass method_vehclass alg_vehclass classsys_vehclass)
		
		* add-up counts
		gen count_added = class_1_count + class_2_count + class_3_count + class_4_count + class_5_count + class_6_count + class_7_count + class_8_count + class_9_count + class_10_count + class_11_count + class_12_count + class_13_count		
		replace count_added = total_volume if count_added == . & total_volume ~= -1 & total_volume ~= .  // Note: no class counts, only total volume is available 
		assert(count_added >= 0 | count_added == .) 
		
		* aggregate to station-daily		
		gen N_stationXhr = 1 if count_added ~= .
		
		collapse (sum) total_volume count_added class_* N_stationXhr, by(fips_st fips_cnty stationid dir_travel lane_travel num_lane_vehclass method_vehclass alg_vehclass classsys_vehclass month_data day_data yr_data)
		
		save "$Truck/processed_data/tmas/class/temp/stationXday_`y'.dta", replace
	}
	
	local files: dir "$Truck/processed_data/tmas/class/temp" files "stationXday*.dta"
	
	clear
	foreach file in `files' {
		append using "$Truck/processed_data/tmas/class/temp/`file'"
		*erase "$Truck/processed_data/tmas/class/temp/`file'"
	}
	*** End of Year loop
	
	* label veh class
	local class "motorcycle passenger 4_tire bus truck1 truck2 truck3 truck4 truck5 truck6 truck7 truck8 truck9 unclass1 unclass2" 

	local v = 1
	foreach i in `class' {
		label var class_`v'_count "`i'"
		local v = `v' + 1
		}

	* total truck count
	gen truck_count = class_4_count + class_5_count + class_6_count + class_7_count + class_8_count + class_9_count + ///
		class_10_count + class_11_count + class_12_count + class_13_count

	drop class_14_count class_15_count  // both are 0 for all obv


	drop if fips_cnty == ""
	egen countyfips = concat(fips_st fips_cnty)

	destring month_data day_data yr_data, force replace
	replace yr_data = yr_data + 2000
	gen date_td = mdy(month_data, day_data, yr_data)

	compress
	save "$Truck/processed_data/tmas/class/class_stationXday.dta", replace
}
	





































	